欢迎关注“小丫画图”公众号,回复“小白”,看小视频,实现点鼠标跑代码。
小丫微信: epigenomics E-mail: figureya@126.com
作者:Hazard
小丫编辑校验
单细胞CNV的计算和画图。
Figure 1. Characterizing Intra-tumoral Expression Heterogeneity in HNSCC by Single-Cell RNA-Seq. (B) Heatmap shows large-scale CNVs for individual cells (rows) from a representative tumor (MEEI5), inferred based on the average expression of 100 genes surrounding each chromosomal position (columns). Red: amplifications; blue: deletions.
出自https://www.cell.com/cell/fulltext/S0092-8674(17)31270-9
其他文章里类似的图:
Fig. 1. Dissection of melanoma with single-cell RNA-seq. (B) Chromosomal landscape of inferred large-scale CNVs allows us to distinguish malignant from nonmalignant cells. The Mel80 tumor is shown with individual cells (y axis) and chromosomal regions (x axis). Amplifications (red) or deletions (blue) were inferred by averaging expression over 100-gene stretches on the respective chromosomes.
用单细胞RNA-seq数据计算CNV,对比展示多组之间的差异。
使用国内镜像安装包
options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
加载包
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Warning: package 'AnnotationDbi' was built under R version 3.6.1
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.6.1
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.6.1
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
##
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caTools)
##
## Attaching package: 'caTools'
## The following object is masked from 'package:IRanges':
##
## runmean
## The following object is masked from 'package:S4Vectors':
##
## runmean
library(pheatmap)
Sys.setenv(LANGUAGE = "en") #显示英文报错信息
options(stringsAsFactors = FALSE) #禁止chr转成factor
自定义函数
minmax <- function(x, min, max ){
x[x > max] <- max
x[x < min] <- min
return(x)
}
GSE103322_HNSCC_all_data.txt.gz,这里以原文的数据为例,其他单细胞RNA-seq数据与此类似,包含表达矩阵和样品分组等信息meta data。
Download GSE103322_HNSCC_all_data.txt.gz from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103322,已上传到微云,下载链接https://share.weiyun.com/5gsuteR
# read cell annotation information
annodata <- as.data.frame(t(
read.table(file = gzfile("GSE103322_HNSCC_all_data.txt.gz"),
sep = "\t", header = T, row.names = 1, nrows = 5)
))
annodata[1:3,1:3]
## processed by Maxima enzyme Lymph node
## HN28_P15_D06_S330_comb 1 1
## HN28_P6_G05_S173_comb 1 0
## HN26_P14_D11_S239_comb 1 1
## classified as cancer cell
## HN28_P15_D06_S330_comb 0
## HN28_P6_G05_S173_comb 0
## HN26_P14_D11_S239_comb 1
# authors did not provide tumor ID for each single cell? horriable...
tmp <- reshape2::colsplit(gsub(pattern = "HNSCC_17", replacement = "HNSCC17", x = rownames(annodata)),
"_", names = letters[1:10]) #
annodata$tumor <- gsub(pattern = "HN|HNSCC", replacement = "MEEI", tmp$a, ignore.case = T)
head(tmp)
## a b c d e f g h i j
## 1 HN28 P15 D06 S330 comb NA NA NA
## 2 HN28 P6 G05 S173 comb NA NA NA
## 3 HN26 P14 D11 S239 comb NA NA NA
## 4 HN26 P14 H05 S281 comb NA NA NA
## 5 HN26 P25 H09 S189 comb NA NA NA
## 6 HN26 P14 H06 S282 comb NA NA NA
# read normalized expression values
exprdata <- read.table(file = gzfile("GSE103322_HNSCC_all_data.txt.gz"),
sep = "\t", header = T, row.names = 1, skip = 5)
colnames(exprdata) <- rownames(annodata)
exprdata[1:3, 1:3]
## HN28_P15_D06_S330_comb HN28_P6_G05_S173_comb HN26_P14_D11_S239_comb
## C9orf152 0.0000 0.0000 0.42761
## RPS11 6.0037 7.3006 7.28850
## ELMO2 0.0000 0.0000 0.00000
# you may save variables into a Rdata file for quicker data reloading.
save(exprdata, annodata, file = "data.Rdata")
Load and preprocess data
We used the remaining cells (k = 5,902) to identify genes that are expressed at high or intermediate levels by calculating the aggregate
expression of each gene i across the k cells, as Ea(i) = log2(average(TPM(i)1.k)+1), and excluded genes with Ea < 4. For the remaining
cells and genes, we defined relative expression by centering the expression levels, Eri,j = Ei,j-average[Ei,1.k]. The relative
expression levels, across the remaining subset of cells and genes, were used for downstream analysis.
varList <- load(file = "data.Rdata")
varList
## [1] "exprdata" "annodata"
# all cells are high quality with more than 2000 genes in individual cell.
annodata$gene_number <- colSums(exprdata > 0)
# filter genes with high or intermediate expression levels
# Note: This step of filtering out lowly expressed genes is and necessary, otherwise the CNV estimation result may de distorted.
tpmdata <- 10*(2^exprdata-1)
gene_average <- apply(tpmdata, 1, function(x){ log2(mean(x)+1)})
gene_mask <- gene_average > 4
sum(gene_mask)/length(gene_mask)
## [1] 0.3113231
exprdata <- exprdata[gene_mask,]
# sort genes by genomic location
gene_loc <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(exprdata),
columns = c( "CHRLOC"),
keytype = "SYMBOL")
## Warning in .deprecatedColsMessage(): Accessing gene location information via 'CHR','CHRLOC','CHRLOCEND' is
## deprecated. Please use a range based accessor like genes(), or select()
## with columns values like TXCHROM and TXSTART on a TxDb or OrganismDb
## object instead.
## 'select()' returned 1:many mapping between keys and columns
chr_used <- c(as.character(1:22),"X")
gene_loc %<>%
dplyr::filter(CHRLOCCHR %in% chr_used) %<>% # filter out scaffold
dplyr::mutate(chr = factor(CHRLOCCHR, levels = chr_used)) %<>% # set levels for chr
dplyr::arrange(chr, abs(CHRLOC)) # sort genes by genomic location
gene_loc_uniq <- gene_loc[!duplicated(gene_loc$SYMBOL),] # gene deduplication
# set row-order of exprdata
exprdata <- exprdata[gene_loc_uniq$SYMBOL, ]
# get relative expression by centering (note: no scaling)
reladata <- sweep(exprdata, 1, rowMeans(exprdata))
# bound data into [-3, 3]
reladata <- minmax(reladata, -3, 3)
window_length <- 100
# initial CNVs
initial_cnv <- NA + reladata # genes with location information were used for downstream analyses
for (chr in chr_used) {
chr_genes = gene_loc_uniq$SYMBOL[gene_loc_uniq$chr == chr]
chr_data = reladata[chr_genes, , drop = FALSE]
if (nrow(chr_data) > 1) {
chr_data = apply(chr_data, 2, caTools::runmean, k = window_length)
initial_cnv[chr_genes, ] <- chr_data
}else{
print(chr)
}
}
which(is.na(initial_cnv)) # check inistal_cnv
## integer(0)
# re-centering data across chromosome after smoothing, see (Patel et al., 2014)
initial_cnv <- sweep(initial_cnv, 2, apply(initial_cnv, 2, median), FUN = "-")
# initial CNV score of each single-cell
initial_cnv_score <- colMeans(initial_cnv^2)
# initial CNV correlation score
cell_in_which_tumor <- annodata$tumor
initial_cnv_score_tumor_profile <- sapply(unique(cell_in_which_tumor), function(x){
rowMeans(initial_cnv[, cell_in_which_tumor == x])
})
initial_cnv_corr <- sapply(colnames(initial_cnv), function(x) {
cor(x = as.numeric(initial_cnv[, x, drop = T]), y = initial_cnv_score_tumor_profile[,annodata[x,"tumor"]])
})
initial_cnv_score_threshold <- 0.05
initial_cnv_corr_threshold <- 0.5
# putative maglignant cells
initital_putative_maglignant <- names(which(
initial_cnv_score > initial_cnv_score_threshold &
initial_cnv_corr > initial_cnv_corr_threshold
))
# putative non-maglignant cells
initital_putative_non_maglignant <- names(which(
initial_cnv_score < initial_cnv_score_threshold &
initial_cnv_corr < initial_cnv_corr_threshold
))
table(annodata[initital_putative_maglignant, "classified as cancer cell"])
##
## 0 1
## 1 1323
table(annodata[initital_putative_non_maglignant, "classified as non-cancer cells"])
##
## 0 1
## 746 3008
annodata_base <- annodata[initital_putative_non_maglignant, c("non-cancer cell type"), drop = F]
table(annodata_base$`non-cancer cell type`)
##
## -Fibroblast 0 B cell Dendritic Endothelial Fibroblast
## 11 746 125 34 244 1328
## Macrophage Mast myocyte T cell
## 97 117 19 1033
types_used <- c("B cell", "Dendritic", "Endothelial", "Fibroblast", "Macrophage", "Mast", "myocyte", "T cell")
annodata_base <- annodata_base[annodata_base$`non-cancer cell type` %in% types_used,,drop = F]
# baseline
baseline_cnv <- as.matrix(t(apply(initial_cnv[,rownames(annodata_base)], 1, function(x) {
tapply(x, annodata_base$`non-cancer cell type`, mean)
})))
baseline_max <- matrix(rowMax(baseline_cnv),
nrow = nrow(initial_cnv),
ncol = ncol(initial_cnv),
dimnames = dimnames(initial_cnv))
baseline_min <- matrix(rowMin(baseline_cnv),
nrow = nrow(initial_cnv),
ncol = ncol(initial_cnv),
dimnames = dimnames(initial_cnv))
# final CNVs
final_cnv <- 0* initial_cnv
final_cnv[initial_cnv > baseline_max + 0.2] <- (initial_cnv - baseline_max)[initial_cnv > baseline_max + 0.2]
final_cnv[initial_cnv < baseline_min - 0.2] <- (initial_cnv - baseline_min)[initial_cnv < baseline_min - 0.2]
# re-centering data across chromosome after smoothing
# final_cnv <- sweep(final_cnv, 2, apply(final_cnv, 2, median), FUN = "-")
用pheatmap画图
annodata$type <- factor(ifelse(annodata$`classified as non-cancer cells` == 1,
"Non-malignant",
ifelse(annodata$`classified as cancer cell` == 1 & annodata$`Lymph node` == 0,
"Malignant (primary)",
"Malignant (LN)"
)),
levels = c("Non-malignant", "Malignant (primary)","Malignant (LN)"))
phAnno <- annodata[annodata$tumor == "MEEI5",]
phAnno <- phAnno[order(phAnno$type),]
phData <- t(final_cnv[,rownames(phAnno)])
phData <- minmax(phData, min = -1, max = 1)
pheatmap(phData,
color = colorRampPalette(c("darkblue", "blue", "grey90", "red", "red4"),
interpolate = "linear")(11),
# breaks = c(seq(-1, -0.1, length.out = 50),
# seq(0.1, 1, length.out = 50)),
annotation_row = phAnno[, "type", drop = F],
gaps_col = cumsum(table(gene_loc_uniq$chr)),
cluster_rows = F, cluster_cols = F,
show_colnames = F, show_rownames = F,
filename = "scCNV.pdf")